In [1]:
# !sudo pip install catboost plotly

Обработка лидарных данных

Сегментация

Про лидар

In [2]:
# Embed the YouTube video with id 'Pa-q5elS_nE' in the cell output
# (this cell sits under the "Про лидар" heading).
from IPython.lib.display import YouTubeVideo
YouTubeVideo('Pa-q5elS_nE')
Out[2]:

А что за данные на самом деле

In [177]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
import tqdm.notebook
py.init_notebook_mode(connected=True)

# Layout shared by every 3D figure in this notebook: zero margins on all four
# sides and a scene whose aspect mode is 'data'.
EQUAL_ASPECT_RATIO_LAYOUT = {
    'margin': {'l': 0, 'r': 0, 'b': 0, 't': 0},
    'scene': {'aspectmode': 'data'},
}


def color(x, cmap='Reds'):
    """Map numeric values to colours of a matplotlib colormap.

    The values are min-max normalised to [0, 1] before the colormap lookup,
    so the smallest value gets the first colour of the map and the largest
    value gets the last one.

    Args:
        x: array-like of numbers (list, numpy array, pandas Series).
        cmap: name of the matplotlib colormap to use.

    Returns:
        Whatever the colormap returns for the normalised float array.
    """
    cmap = plt.get_cmap(cmap)
    values = np.asarray(x, dtype=float)
    if values.size == 0:
        # Nothing to normalise; min/max of an empty array would raise.
        return cmap(values)

    low = np.min(values)
    span = np.max(values) - low
    if span > 0:
        # Divide by the range (max - min), not by max: dividing by max only
        # reaches 1.0 when the minimum happens to be zero.
        normalised = (values - low) / span
    else:
        # Constant input: avoid 0/0 and map every value to the start of the map.
        normalised = np.zeros_like(values)

    return cmap(normalised)

%matplotlib inline
In [188]:
from joblib import Parallel, delayed, cpu_count
from sklearn.model_selection import train_test_split
In [343]:
# Read the point table and index it by scene id, so that ds.loc[scene_id]
# selects the rows of one scene.
ds = pd.read_csv('./snow.csv').set_index(['scene_id'])
ds.head()
Out[343]:
x y z intensity ring label
scene_id
0 -11.355618 -4.206962 0.344085 0.0 23.0 1.0
0 -5.916535 -1.972164 0.283262 0.0 25.0 1.0
0 -7.410451 -2.113039 2.137792 0.0 31.0 1.0
0 -13.845870 -1.406652 0.406310 0.0 23.0 1.0
0 -8.326218 -0.346060 0.226469 0.0 22.0 1.0
  • intensity - ???
  • ring - ???

Кольцо

In [5]:
# Scene 0: one marker per point, coloured (and hover-labelled) by ring number.
scene = ds.loc[0]

fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(
    x=scene.x,
    y=scene.y,
    z=scene.z,
    mode='markers',
    marker=dict(size=1, color=color(scene.ring, 'tab20')),
    text=scene.ring,
)

py.iplot(fig)

Интенсивность

In [6]:
# Same scene as in the previous cell, now coloured (and hover-labelled) by intensity.
fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(
    x=scene.x,
    y=scene.y,
    z=scene.z,
    mode='markers',
    marker=dict(size=1, color=color(scene.intensity, 'seismic')),
    text=scene.intensity,
)

py.iplot(fig)
In [7]:
# Scene 1: colour encodes intensity, while the hover text shows the ring number.
scene = ds.loc[1]

fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(
    x=scene.x,
    y=scene.y,
    z=scene.z,
    mode='markers',
    marker=dict(size=1, color=color(scene.intensity, 'seismic')),
    text=scene.ring,
)

py.iplot(fig)

Отфильтруем снег

Эвристикой

In [8]:
def filter_by_intensity(intensity, limit=3):
    """Return the result of `intensity > limit`.

    For an array-like `intensity` this is an element-wise boolean mask that is
    True where the intensity is strictly greater than `limit`.
    """
    above_limit = intensity > limit
    return above_limit

# Keep only the points whose intensity exceeds the default limit of
# filter_by_intensity.
filtered_scene = scene[filter_by_intensity(scene.intensity)]


fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(**{
    'x': filtered_scene.x,
    'y': filtered_scene.y,
    'z': filtered_scene.z,
    'mode': 'markers',
    'marker': {
        'size': 1,
        'color': color(filtered_scene.intensity, 'seismic'),
    },
    # The hover text has to come from the filtered frame: taking it from the
    # unfiltered `scene` pairs the plotted points with the wrong ring values.
    'text': filtered_scene.ring
})

py.iplot(fig)

Плохо и непонятно, будем учить

Облачные вычисления

In [317]:
from sklearn.neighbors import KDTree

class ComputeFeatures(object):
    """Compute per-point neighbourhood features for one point cloud.

    Calling an instance builds a KDTree over the points, finds every point's
    neighbours within radius `r`, and returns a DataFrame with the
    neighbourhood statistics followed by the original per-point columns.

    NOTE: the tree is built from the same points that are queried, so every
    point is returned as its own neighbour (at distance 0). The neighbour
    counts and the distance/intensity statistics therefore include the point
    itself.
    """

    def __init__(self, r=1.0, batch_size=4096):
        """
        Args:
            r: neighbourhood radius passed to `KDTree.query_radius`.
            batch_size: number of points whose neighbours are queried in one
                `query_radius` call; bounds the memory held for neighbour lists.
        """
        self.xyz = None
        self.intensity = None
        self.ring = None
        self.index = None
        self.r = r
        self.batch_size = batch_size

    def _feature_names(self):
        """Return a fresh dict with one (unset) entry per computed feature, in output column order."""
        return {
            'neighbours_in_ring': None,  # number of neighbours in the radius with the same ring number
            'neighbours_not_in_ring': None,  # number of neighbours in the radius with a different ring number
            'min_neighbours_intensity': None,  # minimum intensity among the neighbours
            'max_neighbours_intensity': None,  # maximum intensity among the neighbours
            'mean_neighbours_intensity': None,  # mean intensity of the neighbours
            'std_neighbours_intensity': None,  # standard deviation of the neighbours' intensity
            'min_distance_to_neighbours': None,  # minimum distance to the neighbours
            'max_distance_to_neighbours': None,  # maximum distance to the neighbours
            'mean_distance_to_neighbours': None,  # mean distance to the neighbours
            'std_distance_to_neighbours': None  # standard deviation of the distance to the neighbours
        }

    def compute_point_features(self, point_id, neighbours, distances):
        """Compute the feature dict of a single point.

        Args:
            point_id: index of the point in `self.ring` / `self.intensity`.
            neighbours: integer array with the indices of the point's neighbours.
            distances: array with the distance to each neighbour.

        Returns:
            dict keyed like `_feature_names()` with the computed values.

        Raises:
            ValueError: from numpy's min/max if `neighbours` is empty.
        """
        features = self._feature_names()
        neighbours_ring = self.ring[neighbours]
        in_ring = int(np.count_nonzero(neighbours_ring == self.ring[point_id]))
        features['neighbours_in_ring'] = in_ring
        features['neighbours_not_in_ring'] = len(neighbours_ring) - in_ring

        # Look the neighbours' intensities up once instead of once per statistic.
        neighbours_intensity = self.intensity[neighbours]
        stats = [('min', np.min), ('max', np.max), ('mean', np.mean), ('std', np.std)]
        for name, func in stats:
            features[f'{name}_neighbours_intensity'] = func(neighbours_intensity)
            features[f'{name}_distance_to_neighbours'] = func(distances)
        return features

    def get_point_neighbours(self, point_id):
        """Query the neighbours of one point; returns what `query_radius` returns for a single-row query."""
        return self.index.query_radius(self.xyz[point_id][np.newaxis, :], r=self.r, return_distance=True)

    def __call__(self, xyz, intensity, ring, label, ds_cols):
        """Compute the features of every point.

        Args:
            xyz: 2D array of point coordinates, one row per point.
            intensity: per-point intensity array.
            ring: per-point ring number array.
            label: per-point label array.
            ds_cols: names for the original columns. They are assigned by
                position to the columns of `xyz`, then `intensity`, `ring`
                and `label`, so they must be listed in that order.

        Returns:
            DataFrame whose columns are the feature names followed by `ds_cols`.

        Raises:
            ValueError: if the number of names does not match the number of columns.
        """
        self.xyz = xyz[:]
        self.intensity = intensity[:]
        self.ring = ring[:]

        self.index = KDTree(self.xyz)

        # Query neighbours for a chunk of points per call instead of one call
        # per point: the per-call overhead dominates for small queries.
        step = max(1, int(self.batch_size))
        features = []
        for start in range(0, len(self.xyz), step):
            batch_neighbours, batch_distances = self.index.query_radius(
                self.xyz[start:start + step], r=self.r, return_distance=True)
            for offset, (neighbours, distances) in enumerate(zip(batch_neighbours, batch_distances)):
                features.append(self.compute_point_features(start + offset, neighbours, distances))

        feature_names = list(self._feature_names().keys())
        names = feature_names + list(ds_cols)
        ds_data = pd.concat([pd.DataFrame(data=self.xyz), pd.DataFrame(data=intensity), pd.DataFrame(data=ring),
                             pd.DataFrame(data=label)], axis=1)
        features_data = pd.DataFrame(data=features, columns=feature_names)

        data = pd.concat([features_data, ds_data], axis=1)
        if len(names) != data.shape[1]:
            raise ValueError(
                'expected {} column names (features + ds_cols), got {}'.format(data.shape[1], len(names)))
        # Going through .values discards the duplicated positional column labels
        # (0, 1, 2, 0, 0, 0) so that the final names can be assigned by position.
        data = pd.DataFrame(data=data.values, columns=names)
        return data
In [318]:
# ds_features = pd.read_csv('./snow_features.csv')
# ds_features = ds_features.drop(["Unnamed: 0"], axis=1)
# ds_features.shape
In [322]:
# Neighbourhood radius handed to ComputeFeatures for every scene.
R = 1.0

def process_scene(scene_id, scene, r):
    """Compute the features of one scene and save them to ./features/<scene_id>.csv.

    Args:
        scene_id: identifier used for the output file name.
        scene: DataFrame with the columns x, y, z, intensity, ring and label.
        r: neighbourhood radius passed to ComputeFeatures.
    """
    import os  # local import keeps this cell self-contained

    features = ComputeFeatures(r=r)
    features_df = \
        features(scene[['x', 'y', 'z']].values, scene.intensity.values, scene.ring.values, 
                 scene.label.values, scene.columns)
    # to_csv does not create missing directories; exist_ok makes this safe when
    # several workers reach this line at the same time.
    os.makedirs('./features', exist_ok=True)
    features_df.to_csv('./features/{}.csv'.format(scene_id))
    
# Run process_scene for every scene id on a joblib pool with cpu_count() workers;
# tqdm wraps the iterator of scene ids that feeds the pool.
with Parallel(cpu_count()) as pool:
    all_scene_ids = ds.reset_index().scene_id.unique()
    scene_jobs = (
        delayed(process_scene)(scene_id, scene=ds.loc[scene_id], r=R)
        for scene_id in tqdm.tqdm(all_scene_ids)
    )
    pool(scene_jobs)
  0%|          | 0/291 [00:00<?, ?it/s]
  1%|▏         | 4/291 [00:00<00:09, 29.23it/s]
  3%|▎         | 8/291 [00:23<08:31,  1.81s/it]
  4%|▍         | 12/291 [00:41<11:56,  2.57s/it]
  5%|▌         | 16/291 [00:58<14:13,  3.10s/it]
  7%|▋         | 20/291 [01:12<14:19,  3.17s/it]
  8%|▊         | 24/291 [01:36<18:09,  4.08s/it]
 10%|▉         | 28/291 [01:46<15:36,  3.56s/it]
 11%|█         | 32/291 [02:16<20:24,  4.73s/it]
 12%|█▏        | 36/291 [02:37<20:54,  4.92s/it]
 14%|█▎        | 40/291 [02:51<18:45,  4.48s/it]
 15%|█▌        | 44/291 [03:11<19:04,  4.63s/it]
 16%|█▋        | 48/291 [03:25<17:20,  4.28s/it]
 18%|█▊        | 52/291 [03:36<15:17,  3.84s/it]
 19%|█▉        | 56/291 [03:54<15:51,  4.05s/it]
 21%|██        | 60/291 [04:17<17:30,  4.55s/it]
 22%|██▏       | 64/291 [04:37<17:52,  4.73s/it]
 23%|██▎       | 68/291 [04:59<18:26,  4.96s/it]
 25%|██▍       | 72/291 [05:26<19:50,  5.44s/it]
 26%|██▌       | 76/291 [05:42<18:02,  5.04s/it]
 27%|██▋       | 80/291 [05:53<15:17,  4.35s/it]
 29%|██▉       | 84/291 [06:10<14:52,  4.31s/it]
 30%|███       | 88/291 [06:26<14:13,  4.21s/it]
 32%|███▏      | 92/291 [06:46<14:52,  4.48s/it]
 33%|███▎      | 96/291 [07:08<15:26,  4.75s/it]
 34%|███▍      | 100/291 [07:25<14:47,  4.65s/it]
 36%|███▌      | 104/291 [07:43<14:11,  4.55s/it]
 37%|███▋      | 108/291 [07:57<12:56,  4.24s/it]
 38%|███▊      | 112/291 [08:14<12:38,  4.24s/it]
 40%|███▉      | 116/291 [08:31<12:25,  4.26s/it]
 41%|████      | 120/291 [08:48<12:10,  4.27s/it]
 43%|████▎     | 124/291 [09:08<12:27,  4.47s/it]
 44%|████▍     | 128/291 [09:25<11:52,  4.37s/it]
 45%|████▌     | 132/291 [09:44<11:58,  4.52s/it]
 47%|████▋     | 136/291 [10:02<11:45,  4.55s/it]
 48%|████▊     | 140/291 [10:18<10:52,  4.32s/it]
 49%|████▉     | 144/291 [10:32<10:01,  4.09s/it]
 51%|█████     | 148/291 [10:51<10:17,  4.32s/it]
 52%|█████▏    | 152/291 [11:10<10:14,  4.42s/it]
 54%|█████▎    | 156/291 [11:25<09:34,  4.25s/it]
 55%|█████▍    | 160/291 [11:46<09:57,  4.56s/it]
 56%|█████▋    | 164/291 [11:55<08:08,  3.84s/it]
 58%|█████▊    | 168/291 [12:22<09:36,  4.69s/it]
 59%|█████▉    | 172/291 [12:40<09:18,  4.69s/it]
 60%|██████    | 176/291 [12:52<07:56,  4.14s/it]
 62%|██████▏   | 180/291 [13:07<07:24,  4.00s/it]
 63%|██████▎   | 184/291 [13:26<07:35,  4.26s/it]
 65%|██████▍   | 188/291 [13:39<06:45,  3.93s/it]
 66%|██████▌   | 192/291 [14:03<07:30,  4.55s/it]
 67%|██████▋   | 196/291 [14:22<07:18,  4.62s/it]
 69%|██████▊   | 200/291 [14:36<06:27,  4.26s/it]
 70%|███████   | 204/291 [15:00<06:56,  4.79s/it]
 71%|███████▏  | 208/291 [15:13<06:00,  4.34s/it]
 73%|███████▎  | 212/291 [15:41<06:49,  5.19s/it]
 74%|███████▍  | 216/291 [16:05<06:45,  5.41s/it]
 76%|███████▌  | 220/291 [16:18<05:38,  4.77s/it]
 77%|███████▋  | 224/291 [16:40<05:34,  4.99s/it]
 78%|███████▊  | 228/291 [17:00<05:14,  4.99s/it]
 80%|███████▉  | 232/291 [17:18<04:43,  4.81s/it]
 81%|████████  | 236/291 [17:29<03:51,  4.22s/it]
 82%|████████▏ | 240/291 [17:41<03:15,  3.83s/it]
 84%|████████▍ | 244/291 [18:04<03:26,  4.40s/it]
 85%|████████▌ | 248/291 [18:23<03:15,  4.56s/it]
 87%|████████▋ | 252/291 [18:45<03:08,  4.84s/it]
 88%|████████▊ | 256/291 [19:06<02:52,  4.93s/it]
 89%|████████▉ | 260/291 [19:28<02:37,  5.09s/it]
 91%|█████████ | 264/291 [19:40<02:01,  4.51s/it]
 92%|█████████▏| 268/291 [20:01<01:47,  4.68s/it]
 93%|█████████▎| 272/291 [20:19<01:28,  4.65s/it]
 95%|█████████▍| 276/291 [20:36<01:07,  4.51s/it]
 96%|█████████▌| 280/291 [20:54<00:49,  4.49s/it]
 98%|█████████▊| 284/291 [21:08<00:29,  4.21s/it]
100%|██████████| 291/291 [21:33<00:00,  4.45s/it]

Посмотрим на разметку

In [323]:
# Scene 1 again, this time coloured (and hover-labelled) by the label column.
scene = ds.loc[1]

fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(
    x=scene.x,
    y=scene.y,
    z=scene.z,
    mode='markers',
    marker=dict(size=1, color=color(scene.label, 'seismic')),
    text=scene.label,
)

py.iplot(fig)
  5%|▌         | 15/291 [34:26<10:33:39, 137.75s/it]

Поучим что-нибудь

In [337]:
def get_features(ids):
    """Load the saved per-scene feature files and stack them into one frame.

    Reads `features/<scene_id>.csv` for every id, removes the saved index
    column if the file has one, and tags every row with its `scene_id`.

    Args:
        ids: iterable of scene ids.

    Returns:
        DataFrame with the rows of all scenes, in the order of `ids`, with a
        trailing `scene_id` column. The per-file row index is kept as is.

    Raises:
        ValueError: from `pd.concat` if `ids` is empty.
    """
    frames = []
    for scene_id in ids:
        df = pd.read_csv(f'features/{scene_id}.csv')
        # 'Unnamed: 0' is the index column written by to_csv; files saved with
        # index=False do not have it, so do not fail when it is missing.
        df = df.drop(columns=['Unnamed: 0'], errors='ignore')
        # A scalar is broadcast to every row; no need to build a Python list.
        df['scene_id'] = scene_id
        frames.append(df)
    return pd.concat(frames, sort=False)
In [338]:
# Split by scene id (not by point), so that all points of one scene end up in
# the same subset. A fixed random_state makes the split, and therefore every
# metric and plot below, reproducible across kernel restarts.
ids = ds.reset_index().scene_id.unique()
train_ids, test_ids = train_test_split(ids, test_size=0.2, random_state=42)
train_ids, val_ids = train_test_split(train_ids, test_size=0.3, random_state=42)
In [339]:
# Load the saved feature files of each subset (train, then test, then val).
train, test, val = (
    get_features(subset_ids) for subset_ids in (train_ids, test_ids, val_ids)
)
In [356]:
# Print the full list of column names, then display the training frame as the cell output.
print(train.columns)
train
Index(['neighbours_in_ring', 'neighbours_not_in_ring',
       'min_neighbours_intensity', 'max_neighbours_intensity',
       'mean_neighbours_intensity', 'std_neighbours_intensity',
       'min_distance_to_neighbours', 'max_distance_to_neighbours',
       'mean_distance_to_neighbours', 'std_distance_to_neighbours', 'x', 'y',
       'z', 'intensity', 'ring', 'label', 'scene_id'],
      dtype='object')
Out[356]:
neighbours_in_ring neighbours_not_in_ring min_neighbours_intensity max_neighbours_intensity mean_neighbours_intensity std_neighbours_intensity min_distance_to_neighbours max_distance_to_neighbours mean_distance_to_neighbours std_distance_to_neighbours x y z intensity ring label scene_id
0 1.0 6.0 0.0 1.0 0.428571 0.494872 0.0 0.997845 0.630070 0.338248 -7.532469 -6.378462 0.692463 0.0 26.0 1.0 69
1 1.0 1.0 0.0 0.0 0.000000 0.000000 0.0 0.955845 0.477922 0.477922 -5.890380 -4.920837 1.637509 0.0 30.0 1.0 69
2 1.0 3.0 1.0 2.0 1.500000 0.500000 0.0 0.937783 0.587328 0.355844 -12.013908 -9.978291 0.313140 2.0 19.0 1.0 69
3 1.0 8.0 0.0 1.0 0.333333 0.471405 0.0 0.910951 0.624042 0.264682 -8.177974 -6.750264 0.491568 1.0 23.0 1.0 69
4 1.0 5.0 0.0 0.0 0.000000 0.000000 0.0 0.984880 0.623088 0.339762 -3.279662 -2.610378 0.162600 0.0 19.0 1.0 69
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
32226 24.0 79.0 3.0 47.0 18.631068 10.194221 0.0 0.993273 0.632233 0.237811 -11.303551 -11.923957 -0.765344 10.0 12.0 0.0 239
32227 26.0 32.0 4.0 12.0 8.258621 1.824738 0.0 0.999979 0.694685 0.264347 -12.056744 -11.533760 -0.874429 11.0 11.0 0.0 239
32228 15.0 116.0 3.0 47.0 18.030534 10.091898 0.0 0.988946 0.514570 0.236848 -10.447229 -12.158898 -0.653108 30.0 13.0 0.0 239
32229 18.0 10.0 7.0 17.0 9.535714 3.041172 0.0 0.997793 0.683699 0.298388 -15.813773 -13.712821 -0.730935 8.0 14.0 0.0 239
32230 8.0 12.0 10.0 17.0 13.400000 2.034699 0.0 0.944375 0.516763 0.253397 -14.372028 -15.160851 -0.486107 10.0 16.0 0.0 239

3807918 rows × 17 columns

In [340]:
import catboost

def learn(X_train, X_val, y_train, y_val, n_estimators=100, early_stopping_rounds=10, random_seed=None):
    """Fit a CatBoost classifier with early stopping on a validation set.

    Args:
        X_train, y_train: training features and labels.
        X_val, y_val: validation features and labels, used as the eval set.
        n_estimators: maximum number of trees.
        early_stopping_rounds: passed to `fit` as `early_stopping_rounds`.
        random_seed: seed passed to CatBoost; None keeps the library default.

    Returns:
        The fitted `catboost.CatBoostClassifier` (fit with `use_best_model=True`).
    """
    clf = catboost.CatBoostClassifier(n_estimators=n_estimators, random_seed=random_seed)
    clf.fit(
        X_train, y_train, early_stopping_rounds=early_stopping_rounds,
        # Pass the eval set in the same form as the training data (not .values),
        # so both carry the same feature names.
        use_best_model=True, eval_set=(X_val, y_val), plot=True, verbose=False)
    return clf

# Model inputs: every column except the scene id, the target and the raw coordinates.
X_train = train.drop(columns=["scene_id", "label", "x", "y", "z"])
y_train = train["label"]


X_val = val.drop(columns=["scene_id", "label", "x", "y", "z"])
y_val = val["label"]
In [344]:
# Drop the name `ds` before training; earlier cells that read `ds` cannot be
# re-run after this point without reloading it.
del ds
cls = learn(X_train, X_val, y_train, y_val)
In [345]:
# Test inputs use the same column selection as the train/validation inputs.
X_test = test.drop(columns=['scene_id', 'x', 'y', 'z', 'label'])
y_test = test["label"]

from sklearn.metrics import precision_recall_curve, precision_score, recall_score

def test_one(clf, X_test, y_test):
    y_test_hat = clf.predict_proba(X_test)
    pr, rec, thr = precision_recall_curve(y_test, y_test_hat[:, 1])
    ix = np.linspace(1, len(pr)-1, num=2000).astype(int)
    return pr[ix], rec[ix], thr[ix - 1]


def heuristic_filter_scoring():
    """Score the intensity-threshold baseline on the test set.

    For every limit in 1..9, predicts 1 for every test point, resets the
    prediction to 0 where `filter_by_intensity(test.intensity, limit)` holds,
    and records precision and recall against the global `y_test`.

    Returns:
        Tuple `(precisions, recalls, labels)`: two lists with one value per
        limit, and the limits joined into one space-separated string.
    """
    limits = range(1, 10)
    precisions, recalls = [], []
    for limit in limits:
        predicted = np.ones(len(X_test))
        predicted[filter_by_intensity(test.intensity, limit)] = 0
        precisions.append(precision_score(y_test, predicted))
        recalls.append(recall_score(y_test, predicted))

    labels = ' '.join(str(limit) for limit in limits)
    return precisions, recalls, labels

pr_bl, rec_bl, thr_bl = heuristic_filter_scoring()

def plot_pr_rec(*models):
    """Plot precision-recall curves of the given models next to the intensity baseline.

    Args:
        *models: tuples `(name, clf, X, y)`; each curve is computed with
            `test_one(clf, X, y)` and hover-labelled with its thresholds.

    The baseline curve is read from the globals `pr_bl`, `rec_bl` and `thr_bl`.
    """
    traces = []
    # Distinct loop names: X_test / y_test would shadow the notebook globals.
    for model_name, clf, model_X, model_y in models:
        pr, rec, thr = test_one(clf, model_X, model_y)
        traces.append(go.Scattergl(x=rec, y=pr, mode='lines', text=thr, name=model_name))

    # thr_bl is one space-separated string; passed as is, plotly would show the
    # whole string on every marker. Split it so each point gets its own limit.
    baseline_text = thr_bl.split() if isinstance(thr_bl, str) else list(thr_bl)
    pr_rec_bl = go.Scatter(x=rec_bl, y=pr_bl, mode='lines+markers', text=baseline_text, name='Intensity BL')

    layout = go.Layout(
        title='Precision-recall',
        xaxis=dict(
            title='Recall'
        ),
        yaxis=dict(
            title='Precision'
        ))
    fig = go.Figure(
        data=traces + [pr_rec_bl],
        layout=layout)
    py.iplot(fig)
    
models = [('Catboost classifier', cls, X_test, y_test)]
plot_pr_rec(*models)

Повизуализируем

In [346]:
# Class probabilities for every test point (same feature columns as X_test).
y_test_hat = cls.predict_proba(test.drop(columns=['scene_id', 'x', 'y', 'z', 'label']))
In [347]:
# Test frame indexed by scene id, so one scene can be selected with itest.loc[scene_id].
itest = test.set_index('scene_id')
In [353]:
scene_id = 285
# Which scenes land in the test subset depends on the train/test split, so the
# preferred scene may be absent; fall back to the first test scene instead of
# failing with a KeyError.
if scene_id not in itest.index:
    scene_id = itest.index[0]
    print(f'Scene 285 is not in the test subset; showing scene {scene_id} instead')
scene = itest.loc[scene_id]
# Column 1 of predict_proba holds the probability of class 1.
scene_predictions = y_test_hat[(test.scene_id == scene_id).to_numpy()][:, 1]
In [383]:
fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)

# Hard class decision per point, and a 0/1 flag telling whether it matches the label.
predicted_labels = np.round(scene_predictions)
preds = (predicted_labels == scene.label).astype('int')

fig.add_scatter3d(**{
    'x': scene.x,
    'y': scene.y,
    'z': scene.z,
    'mode': 'markers',
    'marker': {
        'size': 3,
        'opacity': 1, 
        # Look the colours up directly (0.0 -> first 'bwr' colour for wrong,
        # 1.0 -> last 'bwr' colour for correct). Going through color() would
        # re-normalise the flags and map an all-correct scene to the "wrong" colour.
        'color': plt.get_cmap('bwr')(np.asarray(preds, dtype=float)),
    },
    # Show the predicted label itself; `preds` is the correctness flag, not the prediction.
    'text': [f'real: {target}, predicted: {pred}' for target, pred in zip(scene.label, predicted_labels)]

})
    

py.iplot(fig)
In [ ]:
 
In [ ]: